Read in the data:

Credit <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Credit.csv")
Credit$Utilization <- Credit$Balance / (Credit$Income*100)
summary(Credit)
       X             Income           Limit           Rating     
 Min.   :  1.0   Min.   : 10.35   Min.   :  855   Min.   : 93.0  
 1st Qu.:100.8   1st Qu.: 21.01   1st Qu.: 3088   1st Qu.:247.2  
 Median :200.5   Median : 33.12   Median : 4622   Median :344.0  
 Mean   :200.5   Mean   : 45.22   Mean   : 4736   Mean   :354.9  
 3rd Qu.:300.2   3rd Qu.: 57.47   3rd Qu.: 5873   3rd Qu.:437.2  
 Max.   :400.0   Max.   :186.63   Max.   :13913   Max.   :982.0  
     Cards            Age          Education        Gender    Student  
 Min.   :1.000   Min.   :23.00   Min.   : 5.00    Male :193   No :360  
 1st Qu.:2.000   1st Qu.:41.75   1st Qu.:11.00   Female:207   Yes: 40  
 Median :3.000   Median :56.00   Median :14.00                         
 Mean   :2.958   Mean   :55.67   Mean   :13.45                         
 3rd Qu.:4.000   3rd Qu.:70.00   3rd Qu.:16.00                         
 Max.   :9.000   Max.   :98.00   Max.   :20.00                         
 Married              Ethnicity      Balance         Utilization     
 No :155   African American: 99   Min.   :   0.00   Min.   :0.00000  
 Yes:245   Asian           :102   1st Qu.:  68.75   1st Qu.:0.01521  
           Caucasian       :199   Median : 459.50   Median :0.09976  
                                  Mean   : 520.01   Mean   :0.15120  
                                  3rd Qu.: 863.00   3rd Qu.:0.21266  
                                  Max.   :1999.00   Max.   :1.12156  
Credit <- Credit[ ,-1]
DT::datatable(Credit, rownames = FALSE)

Best subsets regression

library(leaps)
regfit.full <- regsubsets(Balance ~. , data = Credit, nvmax = 12)
summary(regfit.full)
Subset selection object
Call: regsubsets.formula(Balance ~ ., data = Credit, nvmax = 12)
12 Variables  (and intercept)
                   Forced in Forced out
Income                 FALSE      FALSE
Limit                  FALSE      FALSE
Rating                 FALSE      FALSE
Cards                  FALSE      FALSE
Age                    FALSE      FALSE
Education              FALSE      FALSE
GenderFemale           FALSE      FALSE
StudentYes             FALSE      FALSE
MarriedYes             FALSE      FALSE
EthnicityAsian         FALSE      FALSE
EthnicityCaucasian     FALSE      FALSE
Utilization            FALSE      FALSE
1 subsets of each size up to 12
Selection Algorithm: exhaustive
          Income Limit Rating Cards Age Education GenderFemale StudentYes
1  ( 1 )  " "    " "   "*"    " "   " " " "       " "          " "       
2  ( 1 )  " "    " "   "*"    " "   " " " "       " "          " "       
3  ( 1 )  "*"    " "   "*"    " "   " " " "       " "          "*"       
4  ( 1 )  "*"    "*"   " "    "*"   " " " "       " "          "*"       
5  ( 1 )  "*"    "*"   " "    "*"   " " " "       " "          "*"       
6  ( 1 )  "*"    "*"   "*"    "*"   " " " "       " "          "*"       
7  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       " "          "*"       
8  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
9  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
10  ( 1 ) "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
11  ( 1 ) "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
12  ( 1 ) "*"    "*"   "*"    "*"   "*" "*"       "*"          "*"       
          MarriedYes EthnicityAsian EthnicityCaucasian Utilization
1  ( 1 )  " "        " "            " "                " "        
2  ( 1 )  " "        " "            " "                "*"        
3  ( 1 )  " "        " "            " "                " "        
4  ( 1 )  " "        " "            " "                " "        
5  ( 1 )  " "        " "            " "                "*"        
6  ( 1 )  " "        " "            " "                "*"        
7  ( 1 )  " "        " "            " "                "*"        
8  ( 1 )  " "        " "            " "                "*"        
9  ( 1 )  "*"        " "            " "                "*"        
10  ( 1 ) "*"        "*"            " "                "*"        
11  ( 1 ) "*"        "*"            "*"                "*"        
12  ( 1 ) "*"        "*"            "*"                "*"        
summary(regfit.full)$rsq
 [1] 0.7458484 0.8855145 0.9498788 0.9535800 0.9546034 0.9551550 0.9555780
 [8] 0.9556808 0.9557625 0.9558333 0.9558954 0.9559304
reg.summary <- summary(regfit.full)

Picking

par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = " Adjusted RSq", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red",cex = 2, pch = 20)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp",
type ="l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC",
type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)

par(mfrow = c(1, 1))

Using the generic leaps plotting functions

par(mfrow = c(2, 2))
plot(regfit.full, scale = "r2")
plot(regfit.full, scale = "adjr2")
plot(regfit.full, scale = "Cp")
plot(regfit.full, scale = "bic")

par(mfrow = c(1, 1))

What are the coefficients selected with BIC?

which.min(reg.summary$bic)
[1] 5
coef(regfit.full, which.min(reg.summary$bic))
 (Intercept)       Income        Limit        Cards   StudentYes 
-490.4886212   -6.8997534    0.2525404   21.1105879  406.3008916 
 Utilization 
 155.4748273 

Forward selection with leaps

regfit.fwd <- regsubsets(Balance ~. , data = Credit , nvmax = 12,
method = "forward")
summary(regfit.fwd)
Subset selection object
Call: regsubsets.formula(Balance ~ ., data = Credit, nvmax = 12, method = "forward")
12 Variables  (and intercept)
                   Forced in Forced out
Income                 FALSE      FALSE
Limit                  FALSE      FALSE
Rating                 FALSE      FALSE
Cards                  FALSE      FALSE
Age                    FALSE      FALSE
Education              FALSE      FALSE
GenderFemale           FALSE      FALSE
StudentYes             FALSE      FALSE
MarriedYes             FALSE      FALSE
EthnicityAsian         FALSE      FALSE
EthnicityCaucasian     FALSE      FALSE
Utilization            FALSE      FALSE
1 subsets of each size up to 12
Selection Algorithm: forward
          Income Limit Rating Cards Age Education GenderFemale StudentYes
1  ( 1 )  " "    " "   "*"    " "   " " " "       " "          " "       
2  ( 1 )  " "    " "   "*"    " "   " " " "       " "          " "       
3  ( 1 )  " "    " "   "*"    " "   " " " "       " "          "*"       
4  ( 1 )  "*"    " "   "*"    " "   " " " "       " "          "*"       
5  ( 1 )  "*"    "*"   "*"    " "   " " " "       " "          "*"       
6  ( 1 )  "*"    "*"   "*"    "*"   " " " "       " "          "*"       
7  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       " "          "*"       
8  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
9  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
10  ( 1 ) "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
11  ( 1 ) "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
12  ( 1 ) "*"    "*"   "*"    "*"   "*" "*"       "*"          "*"       
          MarriedYes EthnicityAsian EthnicityCaucasian Utilization
1  ( 1 )  " "        " "            " "                " "        
2  ( 1 )  " "        " "            " "                "*"        
3  ( 1 )  " "        " "            " "                "*"        
4  ( 1 )  " "        " "            " "                "*"        
5  ( 1 )  " "        " "            " "                "*"        
6  ( 1 )  " "        " "            " "                "*"        
7  ( 1 )  " "        " "            " "                "*"        
8  ( 1 )  " "        " "            " "                "*"        
9  ( 1 )  "*"        " "            " "                "*"        
10  ( 1 ) "*"        "*"            " "                "*"        
11  ( 1 ) "*"        "*"            "*"                "*"        
12  ( 1 ) "*"        "*"            "*"                "*"        
reg.summary <- summary(regfit.fwd)

Picking

par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = " Adjusted RSq", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red",cex = 2, pch = 20)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp",
type ="l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC",
type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)

par(mfrow = c(1, 1))

Using the build in leaps plotting functions

par(mfrow = c(2, 2))
plot(regfit.fwd, scale = "r2")
plot(regfit.fwd, scale = "adjr2")
plot(regfit.fwd, scale = "Cp")
plot(regfit.fwd, scale = "bic")

par(mfrow = c(1, 1))

What are the coefficients selected with BIC?

which.min(reg.summary$bic)
[1] 6
coef(regfit.fwd, which.min(reg.summary$bic))
 (Intercept)       Income        Limit       Rating        Cards 
-516.3833320   -6.9477878    0.1823182    1.0605783   15.9497347 
  StudentYes  Utilization 
 403.9421811  153.2844256 

Backward selection with leaps

regfit.bwd <- regsubsets(Balance ~. , data = Credit , nvmax = 12,
method = "backward")
summary(regfit.bwd)
Subset selection object
Call: regsubsets.formula(Balance ~ ., data = Credit, nvmax = 12, method = "backward")
12 Variables  (and intercept)
                   Forced in Forced out
Income                 FALSE      FALSE
Limit                  FALSE      FALSE
Rating                 FALSE      FALSE
Cards                  FALSE      FALSE
Age                    FALSE      FALSE
Education              FALSE      FALSE
GenderFemale           FALSE      FALSE
StudentYes             FALSE      FALSE
MarriedYes             FALSE      FALSE
EthnicityAsian         FALSE      FALSE
EthnicityCaucasian     FALSE      FALSE
Utilization            FALSE      FALSE
1 subsets of each size up to 12
Selection Algorithm: backward
          Income Limit Rating Cards Age Education GenderFemale StudentYes
1  ( 1 )  " "    "*"   " "    " "   " " " "       " "          " "       
2  ( 1 )  "*"    "*"   " "    " "   " " " "       " "          " "       
3  ( 1 )  "*"    "*"   " "    " "   " " " "       " "          "*"       
4  ( 1 )  "*"    "*"   " "    "*"   " " " "       " "          "*"       
5  ( 1 )  "*"    "*"   " "    "*"   " " " "       " "          "*"       
6  ( 1 )  "*"    "*"   "*"    "*"   " " " "       " "          "*"       
7  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       " "          "*"       
8  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
9  ( 1 )  "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
10  ( 1 ) "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
11  ( 1 ) "*"    "*"   "*"    "*"   "*" " "       "*"          "*"       
12  ( 1 ) "*"    "*"   "*"    "*"   "*" "*"       "*"          "*"       
          MarriedYes EthnicityAsian EthnicityCaucasian Utilization
1  ( 1 )  " "        " "            " "                " "        
2  ( 1 )  " "        " "            " "                " "        
3  ( 1 )  " "        " "            " "                " "        
4  ( 1 )  " "        " "            " "                " "        
5  ( 1 )  " "        " "            " "                "*"        
6  ( 1 )  " "        " "            " "                "*"        
7  ( 1 )  " "        " "            " "                "*"        
8  ( 1 )  " "        " "            " "                "*"        
9  ( 1 )  "*"        " "            " "                "*"        
10  ( 1 ) "*"        "*"            " "                "*"        
11  ( 1 ) "*"        "*"            "*"                "*"        
12  ( 1 ) "*"        "*"            "*"                "*"        
reg.summary <- summary(regfit.bwd)

Picking

par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = " Adjusted RSq", type = "l")
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red",cex = 2, pch = 20)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp",
type ="l")
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC",
type = "l")
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)

par(mfrow = c(1, 1))

Using the build in leaps plotting functions

par(mfrow = c(2, 2))
plot(regfit.bwd, scale = "r2")
plot(regfit.bwd, scale = "adjr2")
plot(regfit.bwd, scale = "Cp")
plot(regfit.bwd, scale = "bic")

par(mfrow = c(1, 1))

What are the coefficients selected with BIC?

which.min(reg.summary$bic)
[1] 5
coef(regfit.bwd, which.min(reg.summary$bic))
 (Intercept)       Income        Limit        Cards   StudentYes 
-490.4886212   -6.8997534    0.2525404   21.1105879  406.3008916 
 Utilization 
 155.4748273 

Different Models?

coef(regfit.full, 7)
 (Intercept)       Income        Limit       Rating        Cards 
-487.7563318   -6.9233687    0.1822902    1.0649244   16.3703375 
         Age   StudentYes  Utilization 
  -0.5606190  403.9969037  145.4632091 
coef(regfit.fwd, 7)
 (Intercept)       Income        Limit       Rating        Cards 
-487.7563318   -6.9233687    0.1822902    1.0649244   16.3703375 
         Age   StudentYes  Utilization 
  -0.5606190  403.9969037  145.4632091 
coef(regfit.bwd, 7)
 (Intercept)       Income        Limit       Rating        Cards 
-487.7563318   -6.9233687    0.1822902    1.0649244   16.3703375 
         Age   StudentYes  Utilization 
  -0.5606190  403.9969037  145.4632091 

Choosing among models using validation set approach

set.seed(23)
train = sample(c(TRUE, FALSE), size = nrow(Credit), replace = TRUE)
test <- (!train)
regfit.best <- regsubsets(Balance ~ ., data = Credit[train, ], nvmax = 12)
test.mat <- model.matrix(Balance ~ ., data = Credit[test, ])
val.errors <- numeric(12)
for(i in 1:12){
  coefi <- coef(regfit.best, id = i)
  pred <- test.mat[, names(coefi)]%*%coefi
  val.errors[i] <- mean((Credit$Balance[test] - pred)^2)
}
val.errors
 [1] 55718.601 28715.092 10317.674  9503.439  9646.849  9563.511  9467.747
 [8]  9502.233  9497.102  9472.081  9462.644  9452.881
which.min(val.errors)
[1] 12
coef(regfit.best, which.min(val.errors))
       (Intercept)             Income              Limit 
      -473.7864656         -6.5997774          0.1711735 
            Rating              Cards                Age 
         1.2141640         16.2858289         -0.9600311 
         Education       GenderFemale         StudentYes 
        -0.4846770        -17.7306999        389.3311273 
        MarriedYes     EthnicityAsian EthnicityCaucasian 
        -3.2296938         10.4894250          2.1090643 
       Utilization 
       195.2645556 

Creating a predict function for regsubsets

predict.regsubsets=function(object,newdata ,id ,...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form,newdata)
coefi <- coef(object,id=id)
xvars <- names(coefi)
mat[,xvars]%*%coefi
}

Choosing among models of different sizes with cross validation

k <- 5
set.seed(1)
folds <- sample(1:k, size = nrow(Credit), replace = TRUE)
cv.errors <- matrix(NA, k, 12, dimnames = list(NULL, paste(1:12)))
#
for(j in 1:k){
  best.fit <- regsubsets(Balance ~ ., data = Credit[folds != j, ], nvmax = 12)
  for(i in 1:12){
    pred <- predict(best.fit, Credit[folds ==j,], id = i)
    cv.errors[j, i] <- mean((Credit$Balance[folds==j] - pred)^2)
  }
}
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
       1        2        3        4        5        6        7        8 
54889.41 24995.29 11835.29 10887.59 10671.61 10279.28 10152.01 10403.26 
       9       10       11       12 
10427.87 10410.61 10402.15 10365.81 
which.min(mean.cv.errors)
7 
7 
plot(mean.cv.errors, type = "b")

Note that the best model contains 7 variables. We now perform best subset selection of the full data set in order to obtain the 7-variable model.

reg.best <- regsubsets(Balance ~ ., data = Credit, nvmax = 12)
coef(reg.best, which.min(mean.cv.errors))
 (Intercept)       Income        Limit       Rating        Cards 
-487.7563318   -6.9233687    0.1822902    1.0649244   16.3703375 
         Age   StudentYes  Utilization 
  -0.5606190  403.9969037  145.4632091 
coef(reg.best, 3) # The curve really does not drop much after 3...
(Intercept)      Income      Rating  StudentYes 
-581.078888   -7.874931    3.987472  418.760284 
mymod <- lm(Balance ~ Income + Rating +  Student, data = Credit)
summary(mymod)

Call:
lm(formula = Balance ~ Income + Rating + Student, data = Credit)

Residuals:
     Min       1Q   Median       3Q      Max 
-226.126  -80.445   -5.018   65.192  293.234 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -581.07889   13.83463  -42.00   <2e-16 ***
Income        -7.87493    0.24021  -32.78   <2e-16 ***
Rating         3.98747    0.05471   72.89   <2e-16 ***
StudentYes   418.76028   17.23025   24.30   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 103.3 on 396 degrees of freedom
Multiple R-squared:  0.9499,    Adjusted R-squared:  0.9495 
F-statistic:  2502 on 3 and 396 DF,  p-value: < 2.2e-16

Using stepAIC

library(MASS)
mod.fs <- stepAIC(lm(Balance ~ 1, data = Credit), scope = .~Income + Limit + Cards + Age + Education + Gender + Student + Married + Ethnicity + Rating + Utilization, direction = "forward", test = "F")
Start:  AIC=4905.56
Balance ~ 1

              Df Sum of Sq      RSS    AIC F Value     Pr(F)    
+ Rating       1  62904790 21435122 4359.6 1167.99 < 2.2e-16 ***
+ Limit        1  62624255 21715657 4364.8 1147.76 < 2.2e-16 ***
+ Utilization  1  27382381 56957530 4750.5  191.34 < 2.2e-16 ***
+ Income       1  18131167 66208745 4810.7  108.99 < 2.2e-16 ***
+ Student      1   5658372 78681540 4879.8   28.62 1.488e-07 ***
+ Cards        1    630416 83709496 4904.6    3.00   0.08418 .  
<none>                     84339912 4905.6                      
+ Gender       1     38892 84301020 4907.4    0.18   0.66852    
+ Education    1      5481 84334431 4907.5    0.03   0.87231    
+ Married      1      2715 84337197 4907.5    0.01   0.90994    
+ Age          1       284 84339628 4907.6    0.00   0.97081    
+ Ethnicity    2     18454 84321458 4909.5    0.04   0.95749    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=4359.63
Balance ~ Rating

              Df Sum of Sq      RSS    AIC F Value     Pr(F)    
+ Utilization  1  11779424  9655698 4042.6  484.32 < 2.2e-16 ***
+ Income       1  10902581 10532541 4077.4  410.95 < 2.2e-16 ***
+ Student      1   5735163 15699959 4237.1  145.02 < 2.2e-16 ***
+ Age          1    649110 20786012 4349.3   12.40 0.0004798 ***
+ Cards        1    138580 21296542 4359.0    2.58 0.1087889    
+ Married      1    118209 21316913 4359.4    2.20 0.1386707    
<none>                     21435122 4359.6                      
+ Education    1     27243 21407879 4361.1    0.51 0.4776403    
+ Gender       1     16065 21419057 4361.3    0.30 0.5855899    
+ Limit        1      7960 21427162 4361.5    0.15 0.7011619    
+ Ethnicity    2     51100 21384022 4362.7    0.47 0.6233922    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=4042.64
Balance ~ Rating + Utilization

            Df Sum of Sq     RSS    AIC F Value     Pr(F)    
+ Student    1   2671767 6983931 3915.1 151.493 < 2.2e-16 ***
+ Income     1   1025771 8629927 3999.7  47.069  2.65e-11 ***
+ Married    1     95060 9560638 4040.7   3.937   0.04791 *  
+ Age        1     50502 9605197 4042.5   2.082   0.14983    
<none>                   9655698 4042.6                      
+ Limit      1     42855 9612843 4042.9   1.765   0.18472    
+ Education  1     28909 9626789 4043.4   1.189   0.27616    
+ Gender     1      7187 9648511 4044.3   0.295   0.58735    
+ Cards      1      3371 9652327 4044.5   0.138   0.71017    
+ Ethnicity  2     13259 9642439 4046.1   0.272   0.76231    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3915.06
Balance ~ Rating + Utilization + Student

            Df Sum of Sq     RSS    AIC F Value   Pr(F)    
+ Income     1   2893712 4090219 3703.1 279.451 < 2e-16 ***
+ Limit      1     77766 6906165 3912.6   4.448 0.03557 *  
+ Age        1     58618 6925313 3913.7   3.343 0.06823 .  
<none>                   6983931 3915.1                    
+ Married    1     33686 6950245 3915.1   1.914 0.16725    
+ Education  1      2344 6981587 3916.9   0.133 0.71591    
+ Cards      1      1302 6982630 3917.0   0.074 0.78627    
+ Gender     1         9 6983922 3917.1   0.001 0.98212    
+ Ethnicity  2      1715 6982217 3919.0   0.048 0.95278    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3703.06
Balance ~ Rating + Utilization + Student + Income

            Df Sum of Sq     RSS    AIC F Value     Pr(F)    
+ Limit      1    178086 3912133 3687.3 17.9354 2.847e-05 ***
+ Age        1     34096 4056122 3701.7  3.3120   0.06953 .  
<none>                   4090219 3703.1                      
+ Married    1     15941 4074278 3703.5  1.5416   0.21512    
+ Gender     1      8880 4081339 3704.2  0.8572   0.35508    
+ Cards      1      4628 4085591 3704.6  0.4463   0.50447    
+ Education  1       445 4089774 3705.0  0.0428   0.83613    
+ Ethnicity  2     16108 4074111 3705.5  0.7769   0.46054    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3687.25
Balance ~ Rating + Utilization + Student + Income + Limit

            Df Sum of Sq     RSS    AIC F Value     Pr(F)    
+ Cards      1    129913 3782220 3675.7 13.4989 0.0002718 ***
+ Age        1     29075 3883058 3686.3  2.9427 0.0870572 .  
<none>                   3912133 3687.3                      
+ Gender     1     10045 3902089 3688.2  1.0116 0.3151296    
+ Married    1      8872 3903262 3688.3  0.8932 0.3451820    
+ Education  1      3501 3908633 3688.9  0.3520 0.5533444    
+ Ethnicity  2     12590 3899543 3690.0  0.6328 0.5316436    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3675.74
Balance ~ Rating + Utilization + Student + Income + Limit + Cards

            Df Sum of Sq     RSS    AIC F Value   Pr(F)  
+ Age        1     35671 3746548 3674.0  3.7323 0.05409 .
<none>                   3782220 3675.7                  
+ Gender     1      8945 3773275 3676.8  0.9293 0.33564  
+ Married    1      4801 3777419 3677.2  0.4982 0.48069  
+ Education  1      3733 3778487 3677.3  0.3873 0.53408  
+ Ethnicity  2     10981 3771239 3678.6  0.5693 0.56641  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3673.95
Balance ~ Rating + Utilization + Student + Income + Limit + Cards + 
    Age

            Df Sum of Sq     RSS    AIC F Value  Pr(F)
<none>                   3746548 3674.0               
+ Gender     1    8668.5 3737880 3675.0 0.90676 0.3416
+ Married    1    7191.5 3739357 3675.2 0.75197 0.3864
+ Education  1    3505.2 3743043 3675.6 0.36616 0.5455
+ Ethnicity  2    8615.0 3737933 3677.0 0.44943 0.6383
mod.be <- stepAIC(lm(Balance ~ Income + Limit + Cards + Age + Education + Gender + Student + Married + Ethnicity + Rating + Utilization, data = Credit), direction = "backward", test = "F")
Start:  AIC=3680.77
Balance ~ Income + Limit + Cards + Age + Education + Gender + 
    Student + Married + Ethnicity + Rating + Utilization

              Df Sum of Sq     RSS    AIC F Value     Pr(F)    
- Ethnicity    2     11142 3727965 3678.0    0.58 0.5603551    
- Education    1      2957 3719780 3679.1    0.31 0.5793166    
- Married      1      8329 3725153 3679.7    0.87 0.3522921    
- Gender       1      8958 3725782 3679.7    0.93 0.3347539    
<none>                     3716824 3680.8                      
- Age          1     35042 3751866 3682.5    3.65 0.0568551 .  
- Rating       1     50626 3767450 3684.2    5.27 0.0222141 *  
- Utilization  1     69907 3786730 3686.2    7.28 0.0072828 ** 
- Cards        1    128547 3845371 3692.4   13.38 0.0002889 ***
- Limit        1    287249 4004073 3708.5   29.91 8.141e-08 ***
- Income       1   3046715 6763538 3918.2  317.23 < 2.2e-16 ***
- Student      1   4650015 8366839 4003.3  484.17 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3677.96
Balance ~ Income + Limit + Cards + Age + Education + Gender + 
    Student + Married + Rating + Utilization

              Df Sum of Sq     RSS    AIC F Value     Pr(F)    
- Education    1      3019 3730984 3676.3    0.31 0.5749597    
- Married      1      6271 3734237 3676.6    0.65 0.4190419    
- Gender       1      8509 3736474 3676.9    0.89 0.3466409    
<none>                     3727965 3678.0                      
- Age          1     37386 3765351 3680.0    3.90 0.0489614 *  
- Rating       1     48086 3776052 3681.1    5.02 0.0256547 *  
- Utilization  1     72849 3800814 3683.7    7.60 0.0061067 ** 
- Cards        1    131187 3859153 3689.8   13.69 0.0002468 ***
- Limit        1    294067 4022032 3706.3   30.68   5.6e-08 ***
- Income       1   3036882 6764847 3914.3  316.89 < 2.2e-16 ***
- Student      1   4668283 8396249 4000.7  487.12 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3676.29
Balance ~ Income + Limit + Cards + Age + Gender + Student + Married + 
    Rating + Utilization

              Df Sum of Sq     RSS    AIC F Value     Pr(F)    
- Married      1      6896 3737880 3675.0    0.72 0.3963978    
- Gender       1      8373 3739357 3675.2    0.88 0.3500974    
<none>                     3730984 3676.3                      
- Age          1     37726 3768710 3678.3    3.94 0.0477529 *  
- Rating       1     50282 3781266 3679.6    5.26 0.0224028 *  
- Utilization  1     74587 3805572 3682.2    7.80 0.0054920 ** 
- Cards        1    130839 3861823 3688.1   13.68 0.0002483 ***
- Limit        1    291132 4022117 3704.3   30.43  6.31e-08 ***
- Income       1   3035245 6766229 3912.4  317.27 < 2.2e-16 ***
- Student      1   4689629 8420613 3999.9  490.21 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3675.03
Balance ~ Income + Limit + Cards + Age + Gender + Student + Rating + 
    Utilization

              Df Sum of Sq     RSS    AIC F Value     Pr(F)    
- Gender       1      8668 3746548 3674.0    0.91 0.3415625    
<none>                     3737880 3675.0                      
- Age          1     35395 3773275 3676.8    3.70 0.0550578 .  
- Rating       1     47158 3785038 3678.0    4.93 0.0269210 *  
- Utilization  1     72879 3810759 3680.7    7.62 0.0060328 ** 
- Cards        1    135372 3873252 3687.3   14.16 0.0001936 ***
- Limit        1    303600 4041480 3704.3   31.76 3.347e-08 ***
- Income       1   3056864 6794744 3912.1  319.76 < 2.2e-16 ***
- Student      1   4770749 8508629 4002.1  499.04 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3673.95
Balance ~ Income + Limit + Cards + Age + Student + Rating + Utilization

              Df Sum of Sq     RSS    AIC F Value     Pr(F)    
<none>                     3746548 3674.0                      
- Age          1     35671 3782220 3675.7    3.73 0.0540906 .  
- Rating       1     46902 3793451 3676.9    4.91 0.0273163 *  
- Utilization  1     75071 3821620 3679.9    7.85 0.0053206 ** 
- Cards        1    136510 3883058 3686.3   14.28 0.0001817 ***
- Limit        1    303278 4049826 3703.1   31.73 3.383e-08 ***
- Income       1   3048196 6794744 3910.1  318.93 < 2.2e-16 ***
- Student      1   4765085 8511634 4000.2  498.57 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod.fs)

Call:
lm(formula = Balance ~ Rating + Utilization + Student + Income + 
    Limit + Cards + Age, data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-192.01  -77.03  -14.61   57.03  308.37 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -487.75633   24.70331 -19.745  < 2e-16 ***
Rating         1.06492    0.48072   2.215 0.027316 *  
Utilization  145.46321   51.90256   2.803 0.005321 ** 
StudentYes   403.99690   18.09320  22.329  < 2e-16 ***
Income        -6.92337    0.38768 -17.859  < 2e-16 ***
Limit          0.18229    0.03236   5.633 3.38e-08 ***
Cards         16.37034    4.33160   3.779 0.000182 ***
Age           -0.56062    0.29019  -1.932 0.054091 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 97.76 on 392 degrees of freedom
Multiple R-squared:  0.9556,    Adjusted R-squared:  0.9548 
F-statistic:  1205 on 7 and 392 DF,  p-value: < 2.2e-16
summary(mod.be) 

Call:
lm(formula = Balance ~ Income + Limit + Cards + Age + Student + 
    Rating + Utilization, data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-192.01  -77.03  -14.61   57.03  308.37 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -487.75633   24.70331 -19.745  < 2e-16 ***
Income        -6.92337    0.38768 -17.859  < 2e-16 ***
Limit          0.18229    0.03236   5.633 3.38e-08 ***
Cards         16.37034    4.33160   3.779 0.000182 ***
Age           -0.56062    0.29019  -1.932 0.054091 .  
StudentYes   403.99690   18.09320  22.329  < 2e-16 ***
Rating         1.06492    0.48072   2.215 0.027316 *  
Utilization  145.46321   51.90256   2.803 0.005321 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 97.76 on 392 degrees of freedom
Multiple R-squared:  0.9556,    Adjusted R-squared:  0.9548 
F-statistic:  1205 on 7 and 392 DF,  p-value: < 2.2e-16
car::vif(mod.be)
     Income       Limit       Cards         Age     Student      Rating 
   7.793671  232.919318    1.472901    1.046060    1.233070  230.957276 
Utilization 
   3.323397 
car::vif(mod.fs)
     Rating Utilization     Student      Income       Limit       Cards 
 230.957276    3.323397    1.233070    7.793671  232.919318    1.472901 
        Age 
   1.046060 

Ridge Regression

library(glmnet)
x <- model.matrix(Balance ~ ., data = Credit)[, -1]
y <- Credit$Balance
grid <- 10^seq(10, -2, length = 100)
ridge.mod <- glmnet(x[train,], y[train], alpha = 0, lambda = grid)
dim(coef(ridge.mod))
[1]  13 100
plot(ridge.mod, xvar = "lambda", label = TRUE)

set.seed(123)
cv.out <- cv.glmnet(x[train,], y[train], alpha = 0)
plot(cv.out)

bestlambda <- cv.out$lambda.min
bestlambda
[1] 46.61169
ridge.pred <- predict(ridge.mod, s = bestlambda, newx = x[test, ])
mean((ridge.pred - y[test])^2)
[1] 15981.45
final <- glmnet(x, y, alpha = 0)
predict(final, type = "coefficients", s = bestlambda)
13 x 1 sparse Matrix of class "dgCMatrix"
                               1
(Intercept)        -392.65040251
Income               -2.06928408
Limit                 0.08960334
Rating                1.30869128
Cards                 8.83592621
Age                  -0.57656948
Education             0.25626747
GenderFemale         -1.19561822
StudentYes          291.33552316
MarriedYes          -14.71978788
EthnicityAsian        5.88608786
EthnicityCaucasian    3.29195248
Utilization         643.41091235

Lasso Regression

x <- model.matrix(Balance ~ ., data = Credit)[, -1]
y <- Credit$Balance
grid <- 10^seq(10, -2, length = 100)
lasso.mod <- glmnet(x[train,], y[train], lambda = grid)
dim(coef(lasso.mod))
[1]  13 100
plot(lasso.mod, xvar = "lambda", label = TRUE)

plot(lasso.mod, xvar = "dev", label = TRUE)

set.seed(123)
cv.out <- cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv.out)

bestlambda <- cv.out$lambda.min
bestlambda
[1] 0.7596544
lasso.pred <- predict(lasso.mod, s = bestlambda, newx = x[test, ])
mean((lasso.pred - y[test])^2)
[1] 9549.177
final <- glmnet(x, y, alpha = 1, lambda = grid)
predict(final, type = "coefficients", s = bestlambda)
13 x 1 sparse Matrix of class "dgCMatrix"
                              1
(Intercept)        -477.0705863
Income               -6.6583692
Limit                 0.1642959
Rating                1.2748920
Cards                14.2920828
Age                  -0.5216539
Education            -0.5491745
GenderFemale         -7.4667504
StudentYes          396.0657365
MarriedYes           -8.4455837
EthnicityAsian       10.4355721
EthnicityCaucasian    4.7946826
Utilization         173.0187636
predict(final, type = "coefficients", s = 20)
13 x 1 sparse Matrix of class "dgCMatrix"
                               1
(Intercept)        -386.98601416
Income               -0.38042763
Limit                 0.06244626
Rating                1.36394072
Cards                 .         
Age                   .         
Education             .         
GenderFemale          .         
StudentYes          230.48485806
MarriedYes            .         
EthnicityAsian        .         
EthnicityCaucasian    .         
Utilization         802.36871629

Changing the problem now

Response is now Rating

  • Create a model that predicts Rating with Limit, Cards, Married, Student, and Education as features.
mod <- lm(Rating ~ Limit + Cards + Married + Student + Education, data = Credit)
summary(mod)

Call:
lm(formula = Rating ~ Limit + Cards + Married + Student + Education, 
    data = Credit)

Residuals:
     Min       1Q   Median       3Q      Max 
-22.3855  -6.9708  -0.8064   6.4644  26.0040 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 26.2320857  2.8284022   9.275   <2e-16 ***
Limit        0.0667736  0.0002212 301.902   <2e-16 ***
Cards        4.8520572  0.3725931  13.022   <2e-16 ***
MarriedYes   2.1732013  1.0509888   2.068   0.0393 *  
StudentYes   3.0880657  1.7086441   1.807   0.0715 .  
Education   -0.2598468  0.1641332  -1.583   0.1142    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.19 on 394 degrees of freedom
Multiple R-squared:  0.9957,    Adjusted R-squared:  0.9957 
F-statistic: 1.832e+04 on 5 and 394 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(mod)

par(mfrow = c(1, 1))
car::residualPlots(mod)

           Test stat Pr(>|t|)
Limit          2.157    0.032
Cards         -2.286    0.023
Married           NA       NA
Student           NA       NA
Education      1.174    0.241
Tukey test     2.115    0.034
modN <- lm(Rating ~ poly(Limit, 2, raw = TRUE) + poly(Cards, 2, raw = TRUE) + Married + Student + Education, data = Credit)
summary(modN)

Call:
lm(formula = Rating ~ poly(Limit, 2, raw = TRUE) + poly(Cards, 
    2, raw = TRUE) + Married + Student + Education, data = Credit)

Residuals:
     Min       1Q   Median       3Q      Max 
-27.8814  -6.8317  -0.3358   6.5136  25.9925 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  2.579e+01  3.816e+00   6.760 5.01e-11 ***
poly(Limit, 2, raw = TRUE)1  6.529e-02  7.506e-04  86.984  < 2e-16 ***
poly(Limit, 2, raw = TRUE)2  1.320e-07  6.297e-08   2.096   0.0368 *  
poly(Cards, 2, raw = TRUE)1  7.615e+00  1.301e+00   5.855 1.01e-08 ***
poly(Cards, 2, raw = TRUE)2 -3.972e-01  1.783e-01  -2.228   0.0264 *  
MarriedYes                   2.295e+00  1.043e+00   2.199   0.0285 *  
StudentYes                   3.159e+00  1.693e+00   1.866   0.0628 .  
Education                   -2.774e-01  1.627e-01  -1.705   0.0889 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.09 on 392 degrees of freedom
Multiple R-squared:  0.9958,    Adjusted R-squared:  0.9957 
F-statistic: 1.334e+04 on 7 and 392 DF,  p-value: < 2.2e-16
car::residualPlots(modN)

                           Test stat Pr(>|t|)
poly(Limit, 2, raw = TRUE)        NA       NA
poly(Cards, 2, raw = TRUE)        NA       NA
Married                           NA       NA
Student                           NA       NA
Education                      1.271    0.204
Tukey test                    -0.782    0.434
car::vif(modN)
                               GVIF Df GVIF^(1/(2*Df))
poly(Limit, 2, raw = TRUE) 1.006987  2        1.001742
poly(Cards, 2, raw = TRUE) 1.011571  2        1.002880
Married                    1.014970  1        1.007457
Student                    1.012868  1        1.006413
Education                  1.012733  1        1.006346
summary(modN)

Call:
lm(formula = Rating ~ poly(Limit, 2, raw = TRUE) + poly(Cards, 
    2, raw = TRUE) + Married + Student + Education, data = Credit)

Residuals:
     Min       1Q   Median       3Q      Max 
-27.8814  -6.8317  -0.3358   6.5136  25.9925 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  2.579e+01  3.816e+00   6.760 5.01e-11 ***
poly(Limit, 2, raw = TRUE)1  6.529e-02  7.506e-04  86.984  < 2e-16 ***
poly(Limit, 2, raw = TRUE)2  1.320e-07  6.297e-08   2.096   0.0368 *  
poly(Cards, 2, raw = TRUE)1  7.615e+00  1.301e+00   5.855 1.01e-08 ***
poly(Cards, 2, raw = TRUE)2 -3.972e-01  1.783e-01  -2.228   0.0264 *  
MarriedYes                   2.295e+00  1.043e+00   2.199   0.0285 *  
StudentYes                   3.159e+00  1.693e+00   1.866   0.0628 .  
Education                   -2.774e-01  1.627e-01  -1.705   0.0889 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.09 on 392 degrees of freedom
Multiple R-squared:  0.9958,    Adjusted R-squared:  0.9957 
F-statistic: 1.334e+04 on 7 and 392 DF,  p-value: < 2.2e-16
  • Use your model to predict the Rating for an individual that has a credit card limit of $6,000, has 4 credit cards, is married, and is not a student, and has an undergraduate degree (Education = 16).

  • Use your model to predict the Rating for an individual that has a credit card limit of $12,000, has 2 credit cards, is married, is not a student, and has an eighth grade education (Education = 8).

predict(modN, newdata = data.frame(Limit = 6000, Cards = 4, Married = "Yes", Student = "No", Education = 16), response = "pred")
       1 
444.2526 
### Should be the same as:
coef(modN)[1] + coef(modN)[2]*6000 + coef(modN)[3]*6000^2 + coef(modN)[4]*4 + coef(modN)[5]*4^2 + coef(modN)[6]*1 + coef(modN)[7]*0 + coef(modN)[8]*16
(Intercept) 
   444.2526 
predict(modN, newdata = data.frame(Limit = 12000, Cards = 2, Married = "Yes", Student = "No", Education = 8), response = "pred")
       1 
842.0091